Yiwei Gu HW1 092625

#1

read data 2002 and 2022:

# Load the two yearly daily-PM2.5 CSV exports from the Desktop folder.
# header = TRUE is read.csv's default, spelled out here for clarity.
data2002 <- utils::read.csv(file = "~/Desktop/2002.csv", header = TRUE)
data2022 <- utils::read.csv(file = "~/Desktop/2022.csv", header = TRUE)

check dimensions, headers and tail:

# Row and column counts of the 2002 data.
dim(data2002)
[1] 15976    22
# Row and column counts of the 2022 data.
dim(data2022)
[1] 59918    22
# First six rows of the 2002 data.
head(data2002)
        Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
1 01/05/2002    AQS 60010007   1                           25.1 ug/m3 LC
2 01/06/2002    AQS 60010007   1                           31.6 ug/m3 LC
3 01/08/2002    AQS 60010007   1                           21.4 ug/m3 LC
4 01/11/2002    AQS 60010007   1                           25.9 ug/m3 LC
5 01/14/2002    AQS 60010007   1                           34.5 ug/m3 LC
6 01/17/2002    AQS 60010007   1                           41.0 ug/m3 LC
  Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1              81       Livermore               1              100
2              93       Livermore               1              100
3              74       Livermore               1              100
4              82       Livermore               1              100
5              98       Livermore               1              100
6             115       Livermore               1              100
  AQS.Parameter.Code AQS.Parameter.Description Method.Code
1              88101  PM2.5 - Local Conditions         120
2              88101  PM2.5 - Local Conditions         120
3              88101  PM2.5 - Local Conditions         120
4              88101  PM2.5 - Local Conditions         120
5              88101  PM2.5 - Local Conditions         120
6              88101  PM2.5 - Local Conditions         120
                     Method.Description CBSA.Code
1 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
2 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
3 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
4 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
5 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
6 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
                          CBSA.Name State.FIPS.Code      State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA               6 California                1
2 San Francisco-Oakland-Hayward, CA               6 California                1
3 San Francisco-Oakland-Hayward, CA               6 California                1
4 San Francisco-Oakland-Hayward, CA               6 California                1
5 San Francisco-Oakland-Hayward, CA               6 California                1
6 San Francisco-Oakland-Hayward, CA               6 California                1
   County Site.Latitude Site.Longitude
1 Alameda      37.68753      -121.7842
2 Alameda      37.68753      -121.7842
3 Alameda      37.68753      -121.7842
4 Alameda      37.68753      -121.7842
5 Alameda      37.68753      -121.7842
6 Alameda      37.68753      -121.7842
# Last six rows of the 2002 data.
tail(data2002)
            Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
15971 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
15972 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
15973 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
15974 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
15975 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
15976 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
      Daily.AQI.Value      Local.Site.Name Daily.Obs.Count Percent.Complete
15971              62 Woodland-Gibson Road               1              100
15972              62 Woodland-Gibson Road               1              100
15973               6 Woodland-Gibson Road               1              100
15974              77 Woodland-Gibson Road               1              100
15975              28 Woodland-Gibson Road               1              100
15976              33 Woodland-Gibson Road               1              100
      AQS.Parameter.Code AQS.Parameter.Description Method.Code
15971              88101  PM2.5 - Local Conditions         117
15972              88101  PM2.5 - Local Conditions         117
15973              88101  PM2.5 - Local Conditions         117
15974              88101  PM2.5 - Local Conditions         117
15975              88101  PM2.5 - Local Conditions         117
15976              88101  PM2.5 - Local Conditions         117
                         Method.Description CBSA.Code
15971 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15972 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15973 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15974 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15975 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15976 R & P Model 2000 PM2.5 Sampler w/WINS     40900
                                    CBSA.Name State.FIPS.Code      State
15971 Sacramento--Roseville--Arden-Arcade, CA               6 California
15972 Sacramento--Roseville--Arden-Arcade, CA               6 California
15973 Sacramento--Roseville--Arden-Arcade, CA               6 California
15974 Sacramento--Roseville--Arden-Arcade, CA               6 California
15975 Sacramento--Roseville--Arden-Arcade, CA               6 California
15976 Sacramento--Roseville--Arden-Arcade, CA               6 California
      County.FIPS.Code County Site.Latitude Site.Longitude
15971              113   Yolo      38.66121      -121.7327
15972              113   Yolo      38.66121      -121.7327
15973              113   Yolo      38.66121      -121.7327
15974              113   Yolo      38.66121      -121.7327
15975              113   Yolo      38.66121      -121.7327
15976              113   Yolo      38.66121      -121.7327
# First six rows of the 2022 data.
head(data2022)
        Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
1 01/01/2022    AQS 60010007   3                           12.7 ug/m3 LC
2 01/02/2022    AQS 60010007   3                           13.9 ug/m3 LC
3 01/03/2022    AQS 60010007   3                            7.1 ug/m3 LC
4 01/04/2022    AQS 60010007   3                            3.7 ug/m3 LC
5 01/05/2022    AQS 60010007   3                            4.2 ug/m3 LC
6 01/06/2022    AQS 60010007   3                            3.8 ug/m3 LC
  Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1              58       Livermore               1              100
2              60       Livermore               1              100
3              39       Livermore               1              100
4              21       Livermore               1              100
5              23       Livermore               1              100
6              21       Livermore               1              100
  AQS.Parameter.Code AQS.Parameter.Description Method.Code
1              88101  PM2.5 - Local Conditions         170
2              88101  PM2.5 - Local Conditions         170
3              88101  PM2.5 - Local Conditions         170
4              88101  PM2.5 - Local Conditions         170
5              88101  PM2.5 - Local Conditions         170
6              88101  PM2.5 - Local Conditions         170
                    Method.Description CBSA.Code
1 Met One BAM-1020 Mass Monitor w/VSCC     41860
2 Met One BAM-1020 Mass Monitor w/VSCC     41860
3 Met One BAM-1020 Mass Monitor w/VSCC     41860
4 Met One BAM-1020 Mass Monitor w/VSCC     41860
5 Met One BAM-1020 Mass Monitor w/VSCC     41860
6 Met One BAM-1020 Mass Monitor w/VSCC     41860
                          CBSA.Name State.FIPS.Code      State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA               6 California                1
2 San Francisco-Oakland-Hayward, CA               6 California                1
3 San Francisco-Oakland-Hayward, CA               6 California                1
4 San Francisco-Oakland-Hayward, CA               6 California                1
5 San Francisco-Oakland-Hayward, CA               6 California                1
6 San Francisco-Oakland-Hayward, CA               6 California                1
   County Site.Latitude Site.Longitude
1 Alameda      37.68753      -121.7842
2 Alameda      37.68753      -121.7842
3 Alameda      37.68753      -121.7842
4 Alameda      37.68753      -121.7842
5 Alameda      37.68753      -121.7842
6 Alameda      37.68753      -121.7842
# Last six rows of the 2022 data.
tail(data2022)
            Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
59913 12/01/2022    AQS 61131003   1                            3.4 ug/m3 LC
59914 12/07/2022    AQS 61131003   1                            3.8 ug/m3 LC
59915 12/13/2022    AQS 61131003   1                            6.0 ug/m3 LC
59916 12/19/2022    AQS 61131003   1                           34.8 ug/m3 LC
59917 12/25/2022    AQS 61131003   1                           23.2 ug/m3 LC
59918 12/31/2022    AQS 61131003   1                            1.0 ug/m3 LC
      Daily.AQI.Value      Local.Site.Name Daily.Obs.Count Percent.Complete
59913              19 Woodland-Gibson Road               1              100
59914              21 Woodland-Gibson Road               1              100
59915              33 Woodland-Gibson Road               1              100
59916              99 Woodland-Gibson Road               1              100
59917              77 Woodland-Gibson Road               1              100
59918               6 Woodland-Gibson Road               1              100
      AQS.Parameter.Code AQS.Parameter.Description Method.Code
59913              88101  PM2.5 - Local Conditions         145
59914              88101  PM2.5 - Local Conditions         145
59915              88101  PM2.5 - Local Conditions         145
59916              88101  PM2.5 - Local Conditions         145
59917              88101  PM2.5 - Local Conditions         145
59918              88101  PM2.5 - Local Conditions         145
                                         Method.Description CBSA.Code
59913 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59914 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59915 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59916 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59917 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59918 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
                                    CBSA.Name State.FIPS.Code      State
59913 Sacramento--Roseville--Arden-Arcade, CA               6 California
59914 Sacramento--Roseville--Arden-Arcade, CA               6 California
59915 Sacramento--Roseville--Arden-Arcade, CA               6 California
59916 Sacramento--Roseville--Arden-Arcade, CA               6 California
59917 Sacramento--Roseville--Arden-Arcade, CA               6 California
59918 Sacramento--Roseville--Arden-Arcade, CA               6 California
      County.FIPS.Code County Site.Latitude Site.Longitude
59913              113   Yolo      38.66121      -121.7327
59914              113   Yolo      38.66121      -121.7327
59915              113   Yolo      38.66121      -121.7327
59916              113   Yolo      38.66121      -121.7327
59917              113   Yolo      38.66121      -121.7327
59918              113   Yolo      38.66121      -121.7327

check variable names and types

# Column names and storage types of the 2002 data.
str(data2002)
'data.frame':   15976 obs. of  22 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily.Mean.PM2.5.Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ Daily.AQI.Value               : int  81 93 74 82 98 115 89 62 69 107 ...
 $ Local.Site.Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ Daily.Obs.Count               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Percent.Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS.Parameter.Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS.Parameter.Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ Method.Code                   : int  120 120 120 120 120 120 120 120 120 120 ...
 $ Method.Description            : chr  "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" ...
 $ CBSA.Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA.Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ State.FIPS.Code               : int  6 6 6 6 6 6 6 6 6 6 ...
 $ State                         : chr  "California" "California" "California" "California" ...
 $ County.FIPS.Code              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ Site.Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ Site.Longitude                : num  -122 -122 -122 -122 -122 ...
# Column names and storage types of the 2022 data.
str(data2022)
'data.frame':   59918 obs. of  22 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily.Mean.PM2.5.Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ Daily.AQI.Value               : int  58 60 39 21 23 21 13 38 59 55 ...
 $ Local.Site.Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ Daily.Obs.Count               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Percent.Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS.Parameter.Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS.Parameter.Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ Method.Code                   : int  170 170 170 170 170 170 170 170 170 170 ...
 $ Method.Description            : chr  "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" ...
 $ CBSA.Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA.Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ State.FIPS.Code               : int  6 6 6 6 6 6 6 6 6 6 ...
 $ State                         : chr  "California" "California" "California" "California" ...
 $ County.FIPS.Code              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ Site.Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ Site.Longitude                : num  -122 -122 -122 -122 -122 ...
# Count how often each distinct daily mean PM2.5 value occurs in 2002.
table(data2002$Daily.Mean.PM2.5.Concentration)

    0   0.1   0.2   0.3   0.4   0.5   0.6   0.7   0.8   0.9     1   1.1   1.2 
    3     7    18    23    19    28    31    21    30    35    74    32    28 
  1.3   1.4   1.5   1.6   1.7   1.8   1.9     2   2.1   2.2   2.3   2.4   2.5 
   25    24    29    25    43    29    38    97    26    45    36    39    32 
  2.6   2.7   2.8   2.9     3   3.1   3.2   3.3   3.4   3.5   3.6   3.7   3.8 
   37    38    42    40   167    43    44    46    29    48    46    51    49 
  3.9     4   4.1   4.2   4.3   4.4   4.5   4.6   4.7   4.8   4.9     5   5.1 
   52   227    41    61    60    54    62    66    49    49    57   267    55 
  5.2   5.3   5.4   5.5   5.6   5.7   5.8   5.9     6   6.1   6.2   6.3   6.4 
   63    58    66    62    55    53    63    55   332    44    64    54    57 
  6.5   6.6   6.7   6.8   6.9     7   7.1   7.2   7.3   7.4   7.5   7.6   7.7 
   57    45    61    63    51   309    65    53    70    48    63    63    71 
  7.8   7.9     8   8.1   8.2   8.3   8.4   8.5   8.6   8.7   8.8   8.9     9 
   58    55   302    51    63    49    43    65    55    56    61    73   290 
  9.1   9.2   9.3   9.4   9.5   9.6   9.7   9.8   9.9    10  10.1  10.2  10.3 
   66    55    51    54    78    57    49    72    49   249    57    68    54 
 10.4  10.5  10.6  10.7  10.8  10.9    11  11.1  11.2  11.3  11.4  11.5  11.6 
   62    56    52    50    56    51   217    62    47    51    49    71    50 
 11.7  11.8  11.9    12  12.1  12.2  12.3  12.4  12.5  12.6  12.7  12.8  12.9 
   55    58    39   209    46    54    53    40    54    42    45    62    50 
   13  13.1  13.2  13.3  13.4  13.5  13.6  13.7  13.8  13.9    14  14.1  14.2 
  190    50    47    55    45    61    53    49    45    40   177    47    43 
 14.3  14.4  14.5  14.6  14.7  14.8  14.9    15  15.1  15.2  15.3  15.4  15.5 
   38    42    49    38    57    46    48   138    40    43    50    38    45 
 15.6  15.7  15.8  15.9    16  16.1  16.2  16.3  16.4  16.5  16.6  16.7  16.8 
   47    46    39    38   129    35    34    37    36    36    32    35    29 
 16.9    17  17.1  17.2  17.3  17.4  17.5  17.6  17.7  17.8  17.9    18  18.1 
   34   105    28    33    23    46    36    31    29    31    26    79    27 
 18.2  18.3  18.4  18.5  18.6  18.7  18.8  18.9    19  19.1  19.2  19.3  19.4 
   28    37    21    32    25    48    35    23    88    36    34    29    28 
 19.5  19.6  19.7  19.8  19.9    20  20.1  20.2  20.3  20.4  20.5  20.6  20.7 
   31    21    27    31    20    85    24    21    20    23    26    22    18 
 20.8  20.9    21  21.1  21.2  21.3  21.4  21.5  21.6  21.7  21.8  21.9    22 
   33    24    70    17    26    26    24    31    13    23    20    24    62 
 22.1  22.2  22.3  22.4  22.5  22.6  22.7  22.8  22.9    23  23.1  23.2  23.3 
   15    21    24    21    27    31    23    27    12    68    20    25    20 
 23.4  23.5  23.6  23.7  23.8  23.9    24  24.1  24.2  24.3  24.4  24.5  24.6 
   14    18    29    19    20    13    55    17    20    11    13    24    23 
 24.7  24.8  24.9    25  25.1  25.2  25.3  25.4  25.5  25.6  25.7  25.8  25.9 
   15    17    13    40    14    12    19    12    25     9    22    15    23 
   26  26.1  26.2  26.3  26.4  26.5  26.6  26.7  26.8  26.9    27  27.1  27.2 
   31    17    24    21    15    29    11    20    25    12    48    16    12 
 27.3  27.4  27.5  27.6  27.7  27.8  27.9    28  28.1  28.2  28.3  28.4  28.5 
   24     3    11    15    11    13     8    42    14    11    11     8    16 
 28.6  28.7  28.8  28.9    29  29.1  29.2  29.3  29.4  29.5  29.6  29.7  29.8 
    7    12    10    11    21    13    12    12    10    13    11     9    18 
 29.9    30  30.1  30.2  30.3  30.4  30.5  30.6  30.7  30.8  30.9    31  31.1 
   12    23    11    14     7    12    11     9     7    11     6    24     9 
 31.2  31.3  31.4  31.5  31.6  31.7  31.8  31.9    32  32.1  32.2  32.3  32.4 
    3    13     7    15     8    12     8    12    31    10     2    10    15 
 32.5  32.6  32.7  32.8  32.9    33  33.1  33.2  33.3  33.4  33.5  33.6  33.7 
   12    12     8    10     5    30     6     9     9     8     5     9     3 
 33.8  33.9    34  34.1  34.2  34.3  34.4  34.5  34.6  34.7  34.8  34.9    35 
   12     4    31    10     9     8     7    11     6    11     5     5    20 
 35.1  35.2  35.3  35.4  35.5  35.6  35.7  35.8  35.9    36  36.1  36.2  36.3 
    6    13     1     7     6    13     6    15     8    17    10    10     9 
 36.4  36.5  36.6  36.7  36.8  36.9    37  37.1  37.2  37.3  37.4  37.5  37.6 
    3     8     6     7     7     8    15     3    12     7     6     2     4 
 37.7  37.8  37.9    38  38.1  38.2  38.3  38.4  38.5  38.6  38.7  38.8  38.9 
    6     3     4    25     9     9     2     4     4     4     6     9     3 
   39  39.1  39.2  39.3  39.4  39.5  39.6  39.7  39.8  39.9    40  40.1  40.2 
   13     6     6     7     4     9     9     5     5     6    16     5     5 
 40.3  40.4  40.5  40.6  40.7  40.8  40.9    41  41.1  41.2  41.3  41.4  41.5 
    5     2     9     7     6     5     3    16     4     6     6     5     6 
 41.6  41.7  41.8  41.9    42  42.1  42.2  42.3  42.4  42.5  42.6  42.7  42.8 
    5     8     4     4    22     2     6     7     3     4     1     7     9 
 42.9    43  43.1  43.2  43.3  43.4  43.5  43.6  43.7  43.8  43.9    44  44.1 
    8    18     3     1     2     5     7     3     6     4     3    10     9 
 44.2  44.3  44.4  44.5  44.6  44.7  44.8  44.9    45  45.1  45.2  45.3  45.4 
    4     7     2     6     4     5     4     3    16     4     7     5     2 
 45.5  45.6  45.7  45.8  45.9    46  46.1  46.2  46.3  46.4  46.5  46.6  46.7 
    5     3     3     5     1    17     2     1     7     4     4     4     4 
 46.8  46.9    47  47.1  47.2  47.3  47.4  47.5  47.6  47.7  47.8  47.9    48 
    5     4     7     4     3     6     3     5     3     5     4     2    12 
 48.1  48.3  48.4  48.5  48.7  48.8  48.9    49  49.1  49.2  49.3  49.4  49.5 
    6     2     7     2     6     4     5    10     1     5     1     7     5 
 49.6  49.7  49.9    50  50.1  50.2  50.3  50.4  50.5  50.6  50.7  50.8  50.9 
    2     6     2    13     2     1     4     2     5     1     3     1     2 
   51  51.1  51.2  51.3  51.4  51.5  51.6  51.7  51.8    52  52.1  52.2  52.3 
   10     2     5     9     3     1     4     3     3     2     4     1     5 
 52.4  52.5  52.6  52.7  52.8  52.9    53  53.1  53.2  53.3  53.4  53.5  53.6 
    3     4     4     1     4     3    12     2     6     3     3     4     6 
 53.7  53.9    54  54.1  54.3  54.4  54.5  54.6  54.7  54.8  54.9    55  55.1 
    4     1    10     1     2     6     1     4     2     4     1     3     6 
 55.2  55.3  55.4  55.6  55.7  55.8    56  56.1  56.3  56.5  56.6  56.7  56.8 
    6     4     3     2     1     1     8     2     5     3     3     3     3 
 56.9    57  57.1  57.2  57.3  57.4  57.5  57.6  57.7  57.8  57.9    58  58.1 
    4     9     2     3     3     6     2     2     5     1     1     7     2 
 58.2  58.4  58.5  58.6  58.7  58.8  58.9    59  59.2  59.3  59.4  59.5  59.6 
    3     2     3     5     1     3     2     7     4     1     2     3     2 
 59.7    60  60.2  60.5  60.8  60.9    61  61.1  61.4  61.6  61.7  61.8  61.9 
    5     6     1     1     2     2     8     1     1     3     2     3     2 
   62  62.1  62.2  62.3  62.5  62.6  62.7    63  63.3  63.5  63.6  63.7  63.8 
    9     2     1     1     1     2     1     7     1     1     1     2     1 
 63.9    64  64.1  64.3  64.4  64.5  64.6  64.7  64.8  64.9    65  65.1  65.2 
    4     7     2     1     3     1     1     2     2     1     7     3     2 
 65.3  65.4  65.7    66  66.2  66.3  66.6  66.8  66.9    67  67.2  67.3  67.6 
    2     1     1     7     3     5     1     2     1     3     1     1     2 
 67.7  67.8  67.9    68  68.1  68.2  68.6  68.7  68.8    69  69.1  69.2  69.6 
    3     2     1     5     2     1     1     4     2     6     1     1     3 
 69.7  69.9    70  70.1  70.2  70.3  70.5  70.8  70.9    71  71.1  71.2  71.3 
    2     2     2     1     1     1     1     1     2     3     3     1     1 
 71.7  71.8  71.9    72  72.3  72.4  72.9    73  73.1  73.2  73.5  73.6  73.9 
    1     3     1     3     1     2     1     5     2     2     1     1     2 
   74  74.4  74.7    75  75.1  75.3  75.5  75.7  75.8    76  76.1  76.3  76.4 
    2     1     1     2     2     1     1     1     1     4     1     1     2 
 76.6  76.7  76.8  76.9    77  77.2  77.4  77.6    78  78.1  78.3  78.5  79.3 
    1     1     1     1     3     2     1     1     1     2     1     1     1 
 79.5    80  80.3  80.4  80.7  80.9    81  81.1  81.3  81.6    82  82.1    83 
    1     2     1     1     2     1     1     1     1     1     3     1     2 
 83.1  83.9    84  84.1  84.2  84.4  84.6    85  85.3  85.6  85.7    86  86.2 
    1     1     2     1     2     1     1     1     1     2     1     1     1 
 86.6    87  87.4  87.5    88  88.6  89.3  89.6  89.8  90.7    91  91.7  92.5 
    1     2     1     1     1     1     1     1     1     1     2     1     1 
 93.9 102.7 104.3 
    1     1     1 
# Count how often each distinct daily mean PM2.5 value occurs in 2022.
table(data2022$Daily.Mean.PM2.5.Concentration)

 -6.7  -6.3  -5.1  -4.7  -4.1  -3.1    -3  -2.2  -2.1    -2  -1.9  -1.7  -1.5 
    1     1     1     2     1     1     1     2     1     1     2     1     1 
 -1.4  -1.3  -1.2  -1.1    -1  -0.9  -0.8  -0.7  -0.6  -0.5  -0.4  -0.3  -0.2 
    6     5     4     4    11     4    12     8    17    17    23    25    32 
 -0.1     0   0.1   0.2   0.3   0.4   0.5   0.6   0.7   0.8   0.9     1   1.1 
   31   128    49    89    95    83   158   142   177   170   154   261   180 
  1.2   1.3   1.4   1.5   1.6   1.7   1.8   1.9     2   2.1   2.2   2.3   2.4 
  283   226   239   330   302   344   323   343   474   384   418   391   404 
  2.5   2.6   2.7   2.8   2.9     3   3.1   3.2   3.3   3.4   3.5   3.6   3.7 
  501   414   545   432   447   607   498   610   510   477   616   541   589 
  3.8   3.9     4   4.1   4.2   4.3   4.4   4.5   4.6   4.7   4.8   4.9     5 
  510   493   721   506   628   526   503   640   542   664   500   505   690 
  5.1   5.2   5.3   5.4   5.5   5.6   5.7   5.8   5.9     6   6.1   6.2   6.3 
  501   644   480   498   627   508   614   510   444   658   457   577   474 
  6.4   6.5   6.6   6.7   6.8   6.9     7   7.1   7.2   7.3   7.4   7.5   7.6 
  479   572   465   567   469   406   554   424   522   409   415   511   423 
  7.7   7.8   7.9     8   8.1   8.2   8.3   8.4   8.5   8.6   8.7   8.8   8.9 
  512   358   375   487   366   476   394   338   462   373   458   371   359 
    9   9.1   9.2   9.3   9.4   9.5   9.6   9.7   9.8   9.9    10  10.1  10.2 
  401   343   403   372   345   394   325   404   295   294   383   283   346 
 10.3  10.4  10.5  10.6  10.7  10.8  10.9    11  11.1  11.2  11.3  11.4  11.5 
  292   288   324   276   300   241   265   312   261   301   235   215   288 
 11.6  11.7  11.8  11.9    12  12.1  12.2  12.3  12.4  12.5  12.6  12.7  12.8 
  207   249   223   201   248   196   243   206   184   203   170   209   158 
 12.9    13  13.1  13.2  13.3  13.4  13.5  13.6  13.7  13.8  13.9    14  14.1 
  178   222   159   233   175   165   180   135   176   114   144   168   132 
 14.2  14.3  14.4  14.5  14.6  14.7  14.8  14.9    15  15.1  15.2  15.3  15.4 
  142   145   125   123   119   136   137   123   146   104   111   109    99 
 15.5  15.6  15.7  15.8  15.9    16  16.1  16.2  16.3  16.4  16.5  16.6  16.7 
  132   105   129    91    91   120    84    76    82    81   106    80    89 
 16.8  16.9    17  17.1  17.2  17.3  17.4  17.5  17.6  17.7  17.8  17.9    18 
   80    76    96    63   104    70    73    71    67    74    58    42    85 
 18.1  18.2  18.3  18.4  18.5  18.6  18.7  18.8  18.9    19  19.1  19.2  19.3 
   42    65    44    44    56    54    67    41    55    53    36    51    31 
 19.4  19.5  19.6  19.7  19.8  19.9    20  20.1  20.2  20.3  20.4  20.5  20.6 
   40    47    39    34    35    30    52    27    44    41    31    53    37 
 20.7  20.8  20.9    21  21.1  21.2  21.3  21.4  21.5  21.6  21.7  21.8  21.9 
   39    27    33    44    32    36    36    28    23    33    42    28    23 
   22  22.1  22.2  22.3  22.4  22.5  22.6  22.7  22.8  22.9    23  23.1  23.2 
   30    38    40    18    25    31    24    31    28    19    24    18    29 
 23.3  23.4  23.5  23.6  23.7  23.8  23.9    24  24.1  24.2  24.3  24.4  24.5 
   21    24    21    20    22    11    19    22    22    16    16    15    17 
 24.6  24.7  24.8  24.9    25  25.1  25.2  25.3  25.4  25.5  25.6  25.7  25.8 
   19    19    16    15    13    17    26    22    20    21    16    17    12 
 25.9    26  26.1  26.2  26.3  26.4  26.5  26.6  26.7  26.8  26.9    27  27.1 
   16    10    13    14    22    18    16    18    20    15    13    19    15 
 27.2  27.3  27.4  27.5  27.6  27.7  27.8  27.9    28  28.1  28.2  28.3  28.4 
   12    10    16    19    15    12    11     9    15    15    12    12     9 
 28.5  28.6  28.7  28.8  28.9    29  29.1  29.2  29.3  29.4  29.5  29.6  29.7 
   18     9    15    12    12    11     9    19    12    11     9    12     8 
 29.8  29.9    30  30.1  30.2  30.3  30.4  30.5  30.6  30.7  30.8  30.9    31 
    9     7     5    15    10     5     8     7    11     8    11     9    19 
 31.1  31.2  31.3  31.4  31.5  31.6  31.7  31.8  31.9    32  32.1  32.2  32.3 
    6    13     8     4     9    10     9     9    11    11     4     4     8 
 32.4  32.5  32.6  32.7  32.8  32.9    33  33.1  33.2  33.3  33.4  33.5  33.6 
    3     7     5    10     7     8     6    13     7     6     5    17     5 
 33.7  33.8  33.9    34  34.1  34.2  34.3  34.4  34.5  34.6  34.7  34.8  34.9 
    6    12     4     6     7     4     3     6     6     9     5     5     6 
   35  35.1  35.2  35.3  35.4  35.5  35.6  35.7  35.8  35.9    36  36.1  36.2 
    8     4     8     3     2    11     3     3     2    10     8     4     9 
 36.3  36.4  36.5  36.6  36.7  36.8    37  37.1  37.2  37.3  37.4  37.5  37.6 
    4     5     6     2     3     5    10     5     2     5     3     5     5 
 37.7  37.8  37.9    38  38.1  38.2  38.3  38.4  38.5  38.6  38.7  38.8  38.9 
    3     4     3     5     5     5     1     4     5     3     3     6     1 
   39  39.1  39.2  39.3  39.4  39.5  39.6  39.7  39.8  39.9    40  40.1  40.2 
    4     5     4     4     3     8     3     6     5     1     3     4     2 
 40.3  40.4  40.5  40.6  40.7  40.8  40.9    41  41.1  41.2  41.3  41.4  41.5 
    2     1     4     4     7     8     2     3     1     5     1     2     3 
 41.6  41.7  41.8  41.9    42  42.1  42.2  42.3  42.4  42.5  42.6  42.7  42.8 
    3     2     4     2     3     3     3     2     2     1     2     4     1 
 42.9    43  43.1  43.2  43.3  43.4  43.5  43.6  43.7  43.8  43.9    44  44.1 
    1     2     5     4     2     3     5     1     4     2     1     5     2 
 44.2  44.3  44.4  44.5  44.8    45  45.2  45.4  45.5  45.7  45.9  46.1  46.2 
    2     1     1     3     2     1     2     1     1     2     1     3     4 
 46.3  46.6  46.7  46.8  46.9    47  47.1  47.2  47.4  47.5  47.8  47.9    48 
    3     1     4     1     1     1     2     1     1     1     1     1     2 
 48.2  48.3  48.5  48.6  48.7  48.9    49  49.1  49.2  49.4  49.7  49.8    50 
    1     1     1     1     3     1     3     1     1     1     1     1     2 
 50.2  50.5  50.8  51.2  51.4  51.5  51.8  51.9  52.2  52.5  52.6  52.8  52.9 
    1     2     1     2     1     1     1     1     1     1     2     1     3 
 53.2  53.3  53.5  53.6  53.8  53.9    54  54.5  54.6  54.7  54.9  55.1  55.6 
    1     1     1     1     1     2     1     1     2     3     1     1     1 
 55.8    56  56.3  57.8  58.1  58.6    59  59.3    60  61.5  61.7  62.3  62.4 
    1     1     1     1     1     2     1     1     1     2     1     1     1 
 62.5  62.7  62.8  62.9    63  63.7    64  64.2  64.4  66.2  66.6  66.7  68.6 
    1     1     1     1     1     1     2     1     1     2     1     1     1 
   69  69.1    70  70.7  71.8    73  73.5  73.8  73.9    74  75.3  75.5  76.3 
    1     1     1     1     1     3     2     1     1     1     2     1     1 
 77.2  77.5    78    81  83.5  83.6  84.4  84.5  85.2  87.3  88.6  88.8  89.2 
    1     1     1     1     1     1     1     1     1     1     1     1     1 
 89.8  90.7  91.4  92.4  96.6  97.2  98.2 101.4 102.3   103   105 106.4 107.2 
    1     1     1     1     1     1     1     2     1     1     1     1     1 
  108 108.8 109.3 109.5 110.2 111.1 111.6 113.6 118.7 119.2   122 133.3 133.8 
    1     1     1     1     1     1     1     1     1     1     1     1     1 
139.2 140.3 141.1 150.9 152.4 155.2 168.7 177.1 178.6 181.7 212.8 218.2 243.9 
    1     1     1     1     1     1     1     1     1     1     1     1     1 
244.7 246.2 296.3 302.5 
    1     1     1     1 
# Min, quartiles, mean and max of daily mean PM2.5 in 2002.
summary(data2002$Daily.Mean.PM2.5.Concentration)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    7.00   12.00   16.12   20.50  104.30 
# Min, quartiles, mean and max of daily mean PM2.5 in 2022.
summary(data2022$Daily.Mean.PM2.5.Concentration)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -6.700   4.100   6.800   8.414  10.700 302.500 
# Histogram of daily mean PM2.5 in 2002.
hist(data2002$Daily.Mean.PM2.5.Concentration)

# Histogram of daily mean PM2.5 in 2022.
hist(data2022$Daily.Mean.PM2.5.Concentration)

# Box plot of daily mean PM2.5 in 2002.
boxplot(data2002$Daily.Mean.PM2.5.Concentration)

# Box plot of daily mean PM2.5 in 2022.
boxplot(data2022$Daily.Mean.PM2.5.Concentration)

Summary: dataset 2002 has 15976 rows and 22 columns, and dataset 2022 has 59918 rows and 22 columns. The key variables are: Date → the sampling date (daily); Daily.Mean.PM2.5.Concentration → the daily average PM2.5 concentration (µg/m³); State / County / CBSA.Name → geographic identifiers; Site.ID (sometimes along with County.FIPS.Code) → identifies the monitoring site; Site.Latitude / Site.Longitude → location of the monitoring site (useful for mapping); POC (Parameter Occurrence Code) → distinguishes multiple instruments at the same site. Sample duration is not a separate column in this export; the concentrations are daily means (for PM2.5, often a 24-hr average). Among these, the daily average PM2.5 concentration (µg/m³) is the main variable we’re investigating in this assignment. The summary shows that dataset 2002 has a mean of 16.12 µg/m³ and a maximum of 104.3 µg/m³. Distribution: right-skewed, with many moderate values and some extreme outliers. Dataset 2022, however, contains implausible values: its minimum is -6.7 µg/m³, and a concentration cannot be negative. Its maximum is 302.5 µg/m³ and its mean is 8.414 µg/m³. The negative values will be removed in a later section, after the two datasets are combined. Distribution: right-skewed, lower central tendency compared to 2002, but with occasional extreme peaks (likely wildfire smoke events). PM2.5 levels in 2022 are, on average, about half of 2002 levels, showing long-term improvement in California’s air quality. However, 2022 shows some very high outliers (>300 µg/m³), which probably correspond to episodic wildfire pollution events. Overall, data quality is good; there are no missing PM2.5 values. The key PM2.5 variable shows clear right-skewness in both years. Long-term air quality has improved substantially, though extreme pollution events remain visible in 2022.

#2 combine and rename:

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
# dplyr is already attached earlier in the file; this repeated call is a no-op.
library(dplyr)

# Tag each yearly table with its year, then stack the two tables row-wise.
data2002 <- mutate(data2002, Year = 2002)
data2022 <- mutate(data2022, Year = 2022)
data_all <- bind_rows(data2002, data2022)

# Shorten the names of the key columns. any_of() renames only the columns
# that are actually present and silently skips the rest.
data_all <- rename(
  data_all,
  any_of(c(
    PM25 = "Daily.Mean.PM2.5.Concentration",
    Latitude = "Site.Latitude",
    Longitude = "Site.Longitude",
    Site = "Site.ID"
  ))
)

# Show the resulting column names.
names(data_all)
 [1] "Date"                      "Source"                   
 [3] "Site"                      "POC"                      
 [5] "PM25"                      "Units"                    
 [7] "Daily.AQI.Value"           "Local.Site.Name"          
 [9] "Daily.Obs.Count"           "Percent.Complete"         
[11] "AQS.Parameter.Code"        "AQS.Parameter.Description"
[13] "Method.Code"               "Method.Description"       
[15] "CBSA.Code"                 "CBSA.Name"                
[17] "State.FIPS.Code"           "State"                    
[19] "County.FIPS.Code"          "County"                   
[21] "Latitude"                  "Longitude"                
[23] "Year"                     
# Label every observation as "Missing" (PM25 is NA), "Negative" (PM25 < 0)
# or "Valid" (everything else).
data_all <- mutate(
  data_all,
  issue = if_else(
    is.na(PM25),
    "Missing",
    if_else(PM25 < 0, "Negative", "Valid")
  )
)

# Per-year counts and proportions of missing and negative readings.
issue_summary <- data_all %>%
  mutate(
    flag_missing = issue == "Missing",
    flag_negative = issue == "Negative"
  ) %>%
  group_by(Year) %>%
  summarise(
    total = n(),
    missing = sum(flag_missing),
    negative = sum(flag_negative),
    prop_missing = mean(flag_missing),
    prop_negative = mean(flag_negative)
  )

issue_summary
# A tibble: 2 × 6
   Year total missing negative prop_missing prop_negative
  <dbl> <int>   <int>    <int>        <dbl>         <dbl>
1  2002 15976       0        0            0       0      
2  2022 59918       0      215            0       0.00359

No missing values in either year. Negative values appear only in 2022, and the proportion is very small (215 of 59,918 observations, about 0.36%). This suggests a slight increase in measurement or data-entry errors in the later year, but overall data quality is good. The maximum PM2.5 value (about 300 µg/m³) is plausible and is kept.

library(dplyr)
library(ggplot2)

# Classify each observation, then compute the share of each category within
# each year.
issue_plot_data <- data_all %>%
  mutate(issue = case_when(
    # Missing readings get their own category: `NA < 0` evaluates to NA, so
    # without this branch they would fall through and be counted as "Valid".
    is.na(PM25) ~ "Missing",
    PM25 < 0 ~ "Negative",
    TRUE ~ "Valid"
  )) %>%
  group_by(Year, issue) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(Year) %>%
  mutate(prop = count / sum(count)) %>%
  # Drop the grouping so the result is a plain table for plotting.
  ungroup()


# Stacked bar per year showing the proportion of observations in each category.
ggplot(issue_plot_data, aes(x = factor(Year), y = prop, fill = issue)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  labs(
    title = "Proportion of Valid and Negative PM2.5 Values by Year",
    x = "Year",
    y = "Proportion",
    fill = "Data Issue"
  ) +
  theme_minimal(base_size = 14)

#3:

library(leaflet)


# Colour palette with one colour per year (2002 and 2022).
pal <- colorFactor(
  palette = c("purple", "green"),
  domain = c(2002, 2022)
)

# data_all holds one row per daily observation, so mapping it directly would
# draw the same site many times over. Keep one row per site, year and location
# so each monitor is drawn once per year.
site_locations <- data_all %>%
  distinct(Year, Site, Latitude, Longitude)

# Interactive map of monitoring sites, coloured by year.
leaflet(site_locations) %>%
  addTiles() %>%
  addCircleMarkers(
    ~Longitude, ~Latitude,
    color = ~pal(Year),
    radius = 5,
    fillOpacity = 0.7,
    popup = ~paste("Year:", Year)
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~Year,
    title = "Year",
    opacity = 1
  )

2002 sites (purple): clustered around major urban areas and some regional monitoring stations. 2022 sites (green): many overlap with 2002 locations, but some new sites appear in additional regions, indicating expanded coverage. Observation: The overall spatial distribution is similar, but there are more monitoring sites in 2022, especially in previously under-monitored areas.

#5:

level1: state level analysis

# Keep only observations with a non-negative PM2.5 value.
data_all_clean <- filter(data_all, PM25 >= 0)

# Per-year summary statistics of PM2.5 across all remaining observations.
# The .names pattern yields columns such as mean_PM25 and median_PM25.
state_summary <- data_all_clean %>%
  group_by(Year) %>%
  summarise(
    across(
      PM25,
      list(
        mean = function(v) mean(v, na.rm = TRUE),
        median = function(v) median(v, na.rm = TRUE),
        sd = function(v) sd(v, na.rm = TRUE),
        min = function(v) min(v, na.rm = TRUE),
        max = function(v) max(v, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    ),
    .groups = "drop"
  )

# Print a heading, then the per-year summary table.
print("State-level summary:")
[1] "State-level summary:"
print(state_summary)
# A tibble: 2 × 6
   Year mean_PM25 median_PM25 sd_PM25 min_PM25 max_PM25
  <dbl>     <dbl>       <dbl>   <dbl>    <dbl>    <dbl>
1  2002     16.1         12     13.9         0     104.
2  2022      8.45         6.8    7.63        0     302.
# Overlaid histograms of PM2.5 for the two years (semi-transparent so both
# distributions stay visible where they overlap).
ggplot(data_all_clean, aes(x = PM25, fill = factor(Year))) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 50) +
  scale_fill_manual(values = c("2002" = "blue", "2022" = "red")) +
  labs(title = "Distribution of PM2.5 in California by Year",
       x = "PM2.5 (µg/m³)", y = "Number of observations", fill = "Year") +
  theme_minimal()

# Side-by-side boxplots of PM2.5 by year. The fill legend is labelled "Year"
# like the other plots, rather than showing the raw expression factor(Year).
ggplot(data_all_clean, aes(x = factor(Year), y = PM25, fill = factor(Year))) +
  geom_boxplot() +
  scale_fill_manual(values = c("2002" = "blue", "2022" = "red")) +
  labs(title = "PM2.5 Distribution in California by Year",
       x = "Year", y = "PM2.5 (µg/m³)", fill = "Year") +
  theme_minimal()

level2:

# Keep only observations with a non-negative PM2.5 value.
data_all_clean <- filter(data_all, PM25 >= 0)

# Mean, median and standard deviation of PM2.5 for each county-year pair.
county_summary <- summarise(
  group_by(data_all_clean, County, Year),
  mean_PM25 = mean(PM25, na.rm = TRUE),
  median_PM25 = median(PM25, na.rm = TRUE),
  sd_PM25 = sd(PM25, na.rm = TRUE),
  .groups = "drop"
)

# Print a heading, then the county-by-year summary table.
print("County-level summary:")
[1] "County-level summary:"
print(county_summary)
# A tibble: 98 × 5
   County        Year mean_PM25 median_PM25 sd_PM25
   <chr>        <dbl>     <dbl>       <dbl>   <dbl>
 1 Alameda       2002     14.3         10     11.4 
 2 Alameda       2022      8.21         7      4.95
 3 Butte         2002     14.8         11.5   11.7 
 4 Butte         2022      6.26         4.5    5.78
 5 Calaveras     2002      9.9          8      6.50
 6 Calaveras     2022      6.04         5      4.10
 7 Colusa        2002     11.7          9     10.0 
 8 Colusa        2022      7.61         6.7    4.76
 9 Contra Costa  2002     15.1          9.5   14.5 
10 Contra Costa  2022      8.24         7.2    4.93
# ℹ 88 more rows
# Horizontal boxplots of PM2.5 for every county, split by year. Counties are
# ordered by their median PM2.5.
ggplot(data_all_clean) +
  geom_boxplot(
    aes(
      x = reorder(County, PM25, FUN = median),
      y = PM25,
      fill = factor(Year)
    )
  ) +
  scale_fill_manual(
    name = "Year",
    values = c("2002" = "blue", "2022" = "red")
  ) +
  coord_flip() +
  ggtitle("PM2.5 by County in California") +
  xlab("County") +
  ylab("PM2.5 (µg/m³)") +
  theme_minimal()

level3 site level:

# Los Angeles County observations only. Negative PM2.5 readings are dropped
# here too, so the site-level analysis applies the same cleaning rule
# (PM25 >= 0) as the state- and county-level analyses.
la_sites <- data_all %>%
  filter(County == "Los Angeles", PM25 >= 0)

# Mean, median and standard deviation of PM2.5 for each site-year pair.
la_summary <- la_sites %>%
  group_by(Site, Year) %>%
  summarise(
    mean_PM25 = mean(PM25, na.rm = TRUE),
    median_PM25 = median(PM25, na.rm = TRUE),
    sd_PM25 = sd(PM25, na.rm = TRUE),
    .groups = "drop"
  )

la_summary
# A tibble: 25 × 5
       Site  Year mean_PM25 median_PM25 sd_PM25
      <int> <dbl>     <dbl>       <dbl>   <dbl>
 1 60370002  2002     20.8        18.7    12.1 
 2 60370002  2022      9.72        9.65    4.39
 3 60370016  2022      8.42        7.8     5.47
 4 60371002  2002     24.0        21.6    12.7 
 5 60371103  2002     22.0        19.3    11.7 
 6 60371103  2022     11.6        10.9     4.57
 7 60371201  2002     18.9        17.0    10.7 
 8 60371201  2022     10.7        10.3     4.56
 9 60371301  2002     23.3        19.8    12.0 
10 60371302  2022     13.0        11.9     6.22
# ℹ 15 more rows
# Horizontal boxplots of PM2.5 for each Los Angeles County monitoring site,
# split by year. Sites are ordered by their median PM2.5.
ggplot(la_sites) +
  geom_boxplot(
    aes(
      x = reorder(Site, PM25, FUN = median),
      y = PM25,
      fill = factor(Year)
    )
  ) +
  scale_fill_manual(
    name = "Year",
    values = c("2002" = "blue", "2022" = "red")
  ) +
  coord_flip() +
  ggtitle("PM2.5 by Monitoring Site in Los Angeles County") +
  xlab("Site") +
  ylab("PM2.5 (µg/m³)") +
  theme_minimal()

First plot: Statewide PM2.5 distribution. X-axis: year (2002 vs. 2022). Y-axis: PM2.5 concentration (µg/m³). Observations: Median PM2.5 decreased from 2002 to 2022. The interquartile range (IQR) is narrower in 2022, suggesting less variability in PM2.5 across the state. Both years show some extreme outliers, but 2022 has more very high outliers (up to ~300 µg/m³), indicating occasional severe pollution events.

Second plot: The county-level boxplots show that PM2.5 is generally higher in urbanized counties, with Los Angeles and Riverside consistently above other counties. In most counties, 2022 PM2.5 values are lower than in 2002 (for example, Alameda's mean fell from 14.3 to 8.21 µg/m³), and the variability is larger in more densely populated areas. Third plot: The site-level boxplots in Los Angeles County reveal that some sites experience higher PM2.5 than others, reflecting local pollution sources. For the sites in the summary table that have data in both years, mean PM2.5 fell between 2002 and 2022 (for example, site 60370002 went from 20.8 to 9.72 µg/m³), but the size of the decrease differs by site, indicating that the magnitude of the air quality improvement is site-specific within the county.